** Age-earnings profiles to identify constrained households
**  for The Effects of Changes in Local Bank Health on Household Consumption
** by Daniel Cooper and Joe Peek, ReStat 2020
/* ////////////////////////////////////////////////////////////////////////// */
clear all
set more off

* this code is divided into blocks, see notes on each block below
scalar hw_ts = 0   /* setup data for heads/wives */
scalar ae = 0      /* calculate age earnings profiles */ 

* update with relevant file paths
local psid "~/PSID_new"
local data "/shared/Joe_Peek/bankhealth_nocamels/data"

////////////////////////////////////////////////////////////////////////////////
**** I: DATA PREP **************************************************************
////////////////////////////////////////////////////////////////////////////////
/* This block starts with individual data to get time series for age and other variables 
for both heads and wives then merges in relevant household variables
	these data are most often restricted to heads because of the way the PSID is setup, 
    but have both head and wife variables so have to extend the relevant wife 'w' variables 
    to the wife's time series 

Timing: everything lagged - so 1985 observation refers to 1984 data. 
	subtract 1 from age. assume empstat the same (can't lag post 1998). assume educ 
	the same (just need max). */
	
if hw_ts {
/////////////////////////////////////////////////////////////////////////////////
*get head and wife parings from the individual data 
use `psid'/movein_out_upd15, clear

drop if year < 1972 
	*BH measures are 1985 onwards, also wife education before 1972 is not great

*extend sex forward to 2013 and 2015 (summary variable - doesn't change year to year)
xtset unique year
carryforward sex, replace

*keep heads and wives only
keep if (relhead == 1 & seqno == 1 & year < 1983) | (relhead == 10 & seqno == 1 & year >=1983) ///
| (relhead == 2 & year < 1983 & seqno == 2 )  | ((relhead == 20 | relhead == 22) & seqno == 2 & year >= 1983)

g wife = 1 if (relhead == 2 & year < 1983 & seqno == 2) | ((relhead == 20 | relhead == 22 & seqno == 2) & year >= 1983)
g head = 1 if (relhead == 1 & seqno == 1 & year < 1983) | (relhead == 10 & seqno == 1 & year >=1983)
*not keeping wives that don't have seqno 2 -- checked main data and heads don't have wife data in those years 

keep famid year unique sex age wife head 

*sex: constant - no lagging needed
*age: subtract one. don't want to introduce missings
replace age = age - 1 

/////////////////////////////////////////////////////////////////////////////////
*Merge with education data 
*Not lagging - what matters is highest education level and don't want to introduce lots of missings
* see psid_educ_data_cleaning.do
merge 1:1 unique year using `psid'/Education/education_hmr_fin, keepusing(educh educw) keep(1 3) nogen
	*merge 1 is the wives. keep

*extend educw data (with heads observation in above dataset) to  wife -- head observation above wife
sort year famid wife 
by year famid: replace educw = educw[_n+1]

/////////////////////////////////////////////////////////////////////////////////
*Merge with labor income data and head employment status (and interview month)
*labincrc is lagged (use this)
*hlabincr_adj is actual earnings year (not survey year) - but after 1998, is actually still lagged 
merge 1:1 unique year using `data'/bhdat_2015_v4, keepusing(hlabincr_adj wlabincr_adj hlabincrc wlabincrc intmo) keep(1 3) nogen
	*merge 1 is the wives. keep

*extend labor income variables to the relevant wife
	*both should have w and h labor income so can compare 
foreach var in wlabincrc wlabincr_adj hlabincrc hlabincr_adj {
	sort year famid wife 
	by year famid: g `var'2 = `var'[_n+1]
	replace `var' = `var'2 if wife == 1
	drop `var'2
}

*extend interview month to both heads and wives time series data
sort year famid wife 
by year famid: g intmo2 = intmo[_n+1]
replace intmo = intmo2 if intmo == . & intmo2 != . 

drop intmo2

/////////////////////////////////////////////////////////////////////////////////
*Merge with employment status 
* data represent actual year employment status is recorded for (year before survey year in pre-1998 data)
*leaving so consistent - can't lag post 1998 since goes to every other year 
merge 1:1 unique year using `psid'/Employment/hmr_psid_employment_fin, keepusing(hempstat wempstat) keep(1 3) nogen
	*conflicting years AND non-heads or wives. employment file has everyone in all years. 
	*importing heads and wives, keep(1 3) 

*employment status is already full for wife and head since employment file has both. doesn't need to be extended. 

/////////////////////////////////////////////////////////////////////////////////
*Merge with business income 
*already lagged
merge m:1 unique year using "`data'/businc_farminc_7613.dta", keepusing(both_abusinc_7692) keep(1 3) nogen

*merge in for 2015 (not in original dataset)
merge m:1 unique year using "`psid'/Income/income_all_hmr.dta", keepusing(wbusinc hbusinc) keep(1 3) nogen

ren both_abusinc_7692 babusinc
ren wbusinc wabusinc
ren hbusinc habusinc

*extend business income variables to the relevant wife (post-1992 when head and wife 
* are seperated out-- prior to that business income is measured for head and wife combined) 
sort year famid wife
by year famid: replace wabusinc = wabusinc[_n+1]

*extend both business income to both of them for pre-1992 
by year famid: g babusinc2 = babusinc[_n+1]
replace babusinc = babusinc2 if babusinc == . & babusinc2 != . 
drop babusinc2 

/////////////////////////////////////////////////////////////////////////////////
*Merge with weeks unemployed
*wife missing data in 77 and 78, not to worry bc BH is 1985 on
*already lagged 

merge 1:1 unique year using `psid'/Employment/hmr_psid_employment_fin, keepusing(wksueh wksuew wksuehe wksuehu wksuewe wksuewu) keep(1 3) nogen

*combine variables 
foreach v in h w {
	g `v'ue_wks = wksue`v'
	replace `v'ue_wks = wksue`v'e if year >= 1976 & year <= 1993
	replace `v'ue_wks = wksue`v'u if year >= 1976 & year <= 1993 & wksue`v'e == .
}

*extend weeks unemployed variable to the relevant wife 
sort year famid wife 
by year famid: replace wue_wks = wue_wks[_n+1]

drop wksueh wksuew wksuehe wksuehu wksuewe wksuewu

/////////////////////////////////////////////////////////////////////////////////

*keep other person's income to compare to. so head observation will have both h and w labor income variables; same for wife 
g labincrc_oth = hlabincrc if wife == 1
replace labincrc_oth = wlabincrc if head == 1 
g labincr_adj_oth = hlabincr_adj if wife == 1
replace labincr_adj_oth = wlabincr_adj if head == 1 

*mark if the head has a wife
sort year famid wife
by year famid: g hwife = 1 if wife[_n-1] == 1 

*change labincrc_oth to missing if there is no wife 
replace labincrc_oth = . if hwife != 1 & head == 1 

*create independent series
ren educh heduc
ren educw weduc
foreach var in labincrc labincr_adj educ abusinc empstat ue_wks {
	g `var' = h`var' if head == 1
	replace `var' = w`var' if wife == 1
	drop w`var' h`var' 
}

save `data'/head_wife_ts, replace
}

////////////////////////////////////////////////////////////////////////////////
**** II: AGE-EARNINGS PROFILES *************************************************
////////////////////////////////////////////////////////////////////////////////
/*seperate age earnings profiles and deviations for husband and wife based on their 
individual full time series data then sort back into households
	in main data, constrained if either head or wife is individually constrained. 
	also mark if both constrained.

Timing: estimate profile and deviations on a lag - data created this way above
	need to use employment status from survey year. So in 2005, have 2004 income with 
	2005 employment status. (can't lag employment status once PSID goes every other year) 

Sample restrictions and estimation approach are discussed in detail in the 
online appendix.  Further details are available upon request */

if ae {
*get individual time series for each head and wife. 
	*can move between households, so want full history to create age-earnings. then sort back into HH

use `data'/head_wife_ts, clear

*quarters
g mtime = ym(year, intmo)
g qtime = qofd(dofm(mtime)) 
g qtr = quarter(dofq(qtime))
drop mtime qtime 

sort year qtr
su year if qtr!=.

drop if qtr == . 
drop if year < 1985

////////////////////////////////////////////////////////////////////////////////
*********
*CLEANING
*********
xtset unique year

//////////////////////////////////////////////
*AGE FIXES
*instances where age will jump up for one period and then back down to its expected value. consider typo and fix.
*after fix these typos, see if ages are on the same 'age path'. whichever path is dominant for the person, revise all other ages to match that path
*then fix if age repeats more than twice in a row.

*note that age can deviate by 1 year in either direction from its expected value due to when in the year the person was interviewed. 
	*ex: interviewed july 1990, age 26. turn 27 in october 1990. turn 28 in october 1991. interviewed again december 1991. so jumped from 26 to 28 from 1990 to 1991.
	*can also happen where age does not change between years b/c of when interview months occur between waves 
*in last step smooth over these to clean up noise 

*NOTE: This procedure is explained further in the online appendix to the paper. 

replace age = . if  age >= 998
	*code is usually 999, but substracted 1 when lagged above

*creating birth year for first observation
sort unique year
by unique: g birthyear = year[1] -  age[1]
*if first few observations are missing, still make sure you get a birthyear
by unique: replace birthyear = year[2] -  age[2] if  age[2] != . &  age[1] == .  
by unique: replace birthyear = year[3] -  age[3] if  age[3] != . &  age[2] == . &  age[1] == . 
	
*assign minimum birthyear that exists in each unique. sometimes vary because sometimes age doesn't change between waves due due to interview timing 
bysort unique: ereplace birthyear = min(birthyear)

*create age path based on birthyear
g age_path = year - birthyear 

*count how often households are off of the age path based on actual year and calculated birthyear 
	*note that if first obs wrong, then assume entire age path wrong
g off_path = 1 if  age != age_path &  age != age_path + 1 &  age != age_path - 1
by unique: egen all_off_path = sum(off_path)

*count what fraction of observations are off the path 
bys unique: egen obs = count(unique)
g off_path_frac = all_off_path / obs 

*create a second age path for people whos off_path fraction is greater than 1/2 (idea is to get an age path that fits better with reported ages
	*start with the next non-missing age. If first observation was wrong, all other observations would be off the path 
sort unique year
by unique: g birthyear2 = year[2] -  age[2] if off_path_frac > .5
by unique: replace birthyear2 = year[3] -  age[3] if  age[3] != . &  age[2] == . &  age[1] == .  & off_path_frac > .5

*assign minimum birthyear for the second path
bysort unique: ereplace birthyear2 = min(birthyear2)

*create age path based on 2nd birthyear
g age_path2 = year - birthyear2

*count when age does not equal age implied by age paths (either of the two age paths)
g age_bad = 1 if (age != age_path &  age != age_path + 1 &  age != age_path - 1) & off_path_frac <= .5
replace age_bad = 1 if (age != age_path2 &  age != age_path2 + 1 &  age != age_path2 - 1) & off_path_frac > .5

*replace age with age path if doesn't equal age path for just one period (i.e. a one period deviation)
*this also applies if the first or last obs is off the path and the subsequent/preceeding is not 
bys unique: replace age = age_path if age_bad == 1 & age_bad[_n-1] != 1 & age_bad[_n+1] != 1 & off_path_frac <= .5
bys unique: replace age = age_path2 if age_bad == 1 & age_bad[_n-1] != 1 & age_bad[_n+1] != 1 & off_path_frac > .5
	
bys unique: replace age_bad = 0 if age_bad == 1 & age_bad[_n-1] != 1 & age_bad[_n+1] != 1 & off_path_frac <= .5
bys unique: replace age_bad = 0 if age_bad == 1 & age_bad[_n-1] != 1 & age_bad[_n+1] != 1 & off_path_frac > .5

*how big is the deviation? 
g age_path_dev = abs(age - age_path) if off_path_frac <= .5
replace age_path_dev = abs(age - age_path2) if off_path_frac > .5

*generally individuals reported age switches from one path to another. so see which path is majority and follow that path 

*indicate 'spells' of age_path deviations (within 1)  [spell = time off one path and on another]
	*even if the age path pattern is 1-2-1, want to group the 1s together. so create a new xtset index. 
sort unique age_path_dev
g index = _n 
xtset unique index
tsspell age_path_dev, cond(age_path_dev[_n-1] == age_path_dev + 1 | age_path_dev[_n-1] == age_path_dev | age_path_dev[_n-1] == age_path_dev - 1) 

by unique: replace _spell = _spell[_n+1] if _spell == 0 	
	*spell will be zero on first obs of a new unique because can't compare to _n-1
	
xtset unique year 

*generate a variable for new spell so that can modify age path as needed
g new_spell = 1 if _spell != _spell[_n-1]

*create new spell if age decreases (even if by 1 - wouldn't be caught by the above spell creation)
by unique: replace new_spell = 1 if age < age[_n-1]

*index spells 
egen spell2 = group(unique new_spell year)
carryforward spell2, replace

*see which spell is the majority
bys unique spell2: egen count_dev = count(year)
bys unique spell2: g frac_dev = count_dev / obs

xtset unique year 

by unique: egen max_spell = max(frac_dev)
by unique: g maj_spell = 1 if max_spell == frac_dev

*create age path based on the majority spell (i.e assume path on longest is the correct age trajectory)
by unique: g birthyear3 = year - age if maj_spell == 1 
by unique: ereplace birthyear3 = min(birthyear3)
g age_path3 = year - birthyear3 

*reassign age to follow the majority age path 
	*vast majority of observations: will just be the one age path because most follow one path and already fixed obvious typos 
replace age = age_path3 if maj_spell != 1 

*if time on different path split evenly then defer to original age path -- maj_spell will be one
*this will indicate if there are multiple spells with the same frac_dev and thus both max_spells
by unique: egen max_spell1 = max(spell) if frac_dev == max_spell
by unique: egen max_spell2 = min(spell) if frac_dev == max_spell

replace age = age_path if max_spell1 != max_spell2 & frac_dev != 1 // if max spell1 != max spell2 then there are multiple 'majority' spells

*see if age remains the same for more than 2 periods -- the above method doesn't catch these because age_path_dev just increases by 1 consistently 
by unique: g age_same = 1 if age == age[_n-1] & age == age[_n-2] & age != . & age[_n-1] != . & age[_n-2] != .
replace age_same = 1 if age_same[_n+1] == 1
replace age_same = 1 if age_same[_n+2] == 1

*apply age path to these
replace age = age_path if age_same == 1 & (off_path_frac <= .5 & frac_dev == 1) | (max_spell1 != max_spell2 & frac_dev != 1)
replace age = age_path2 if age_same == 1 & off_path_frac > .5 & frac_dev == 1
replace age = age_path3 if age_same == 1 & frac_dev != 1 & max_spell1 == max_spell2

*now just enforcing the relevant age path. gets rid of age changing by 0 or 2 year to year because of interview month relative to birthday.
*which introduces unneccessary noise.  (We need paths increasing by 1 for age/earnings calculations)
*this also will fix the few remaining situations where age is bouncing around the relevant age path wtih deviations like 1-1-2-2-2-3-3
replace age = age_path if ((off_path_frac <= .5 & frac_dev == 1) | (max_spell1 != max_spell2 & frac_dev != 1)) 
replace age = age_path2 if off_path_frac > .5 & frac_dev == 1 
replace age = age_path3 if frac_dev != 1 & max_spell1 == max_spell2

drop birthyear* age_path* off_path all_off_path off_path_frac obs maj_spell* max_spell* _* age_bad spell2  frac_dev new_spell age_same


/* SETUP sample for age/earnings calculations as discussed online appendix
(dropping students, business owners etc.) */


////////////////////////////////////////////////////////////////////////////////
*mark if household jumps in and out of psid 
xtset unique year
by unique: g yr_gap = 1 if year != L.year + 1 & year < 1998 & _n != 1
by unique: replace yr_gap = 1 if year != L2.year + 2 & year > 1998 & _n != 1	

*mark if jump in and out of psid for only one period 
xtset unique year
by unique: g yr_gap1 = 1 if year != L.year + 1 & year == L2.year + 2 & year < 1998 & _n != 1
by unique: replace yr_gap1 = 1 if year != L2.year + 2 & year == L4.year + 4 & year > 1998 & _n != 1	

* Drop the Latino sample (consistent with our baseline sample)
drop if unique >=7001001 & unique<=9308399

///////////////////
**DEFINE UNEMPLOYED
g unemployed = 1 if ue_wks > 0 & ue_wks != .

///////////////////
**BUSINESS OWNERS  (See online appendix for more details)
*1. mark business owners based on asset income
g bus = 0
replace bus = 1 if abusinc != 0 & abusinc != . &  year > 1992
replace bus = 1 if babusinc != 0 & babusinc != . & year <= 1992
		*only have a joint variable before 1992. have to assign head and wife as business owners based on it

*2. if jump out of business owning for only one period then say actually business owner
*Mark if jump out of business owning for only one year
g nobus = 0
replace nobus = 1 if bus == 0 

*mark if beginning or end of time series is in the period (including year gap)
by unique: g end_beg = 1 if _n == 1 | _n == _N | yr_gap == 1 | yr_gap[_n+1] == 1 

*how long business owner?	
tsspell bus, fcond((nobus != nobus[_n-1] ) | (_n == 1))
replace _seq = . if nobus == 0
replace _spell = . if nobus == 0 
*mark how long sequence is 
bys unique _spell: egen nobus_length = count(_seq) 

*replace length to missing if beginning or end
bys unique _spell: ereplace end_beg = max(end_beg) if _spell != .
replace nobus_length = . if end_beg == 1

xtset unique year 
drop end_beg _*

*implement change
replace bus = 1 if nobus_length == 1 
*total: 15805 obs, 4924 people
*change: 1,135 obs

///////////////////
**STUDENTS
g student = 0
replace student = 1 if empstat==7
*3,633 obs; 2,510 people

*ENFORCING STUDENT BEFORE 18 
replace student = 1 if age < 18 
*535 additional obs; 458 people

///////////////////
**RETIRED
* code determines whether someone is retired beyond enforcing that anyone is retired after age 65 (for age/earnings purposes)
g retired = 0 

*1. ENFORCING RETIRED AFTER 64
replace retired = 1 if age >= 65
*29,786 obs; 4,610 people

*2. not actually retired if labor income is more than 20% of their last non-retired income 
*what is income in last observation before becomes retired? 
by unique: g last_inc = labincrc if empstat[_n+1] == 4 & empstat != 4 
by unique: replace last_inc = last_inc[_n-1] if last_inc[_n-1] != . & _n != 1 & empstat == 4 
	*can have multiple last_incs per unique if multiple periods of retired 
	*last_inc reevaluates every time empstat changes from 4 
	
*not retired if income > 20% of last non-retired income
g not_retired = 1 if labincrc > .2 * last_inc & empstat == 4 & last_inc != . 
replace retired = 1 if empstat == 4 & not_retired != 1 
*total retired = 36,523. 5,831 people
*empstat = 4 and retired != 1: 1,997 obs; 1,576 people
	
*3. if was retired for at least 5 years and then comes out of retirement, then keep retired  
*mark if stretch of retirement was at least 5 years consecutive
*YEARS not periods. If don't have intervening years, don't make assumptions and leave as is. 
	*must actually have 5 last YEARS as retired. 
	*post 1998 when every other year, must have last 4 years. (two periods) 
	*Annual data: if at least 4 out of the 5 past years are non-missing and retired in all those 4, then this counts as a 5 year period 	

*mark when only one of the last 5 years is missing 
g yr_gap5 = 0 
by unique: replace yr_gap5 = 1 if (L.retired == . | L2.retired == . | L3.retired == . | L4.retired == . | L5.retired == .) & year < 1998
by unique: replace yr_gap5 = 0 if ((L.retired == . & L2.retired == .) | (L.retired == . & L3.retired == .) | (L.retired == . & L4.retired == .) | (L.retired == . & L5.retired == .) ///
| (L2.retired == . & L3.retired == .) | (L2.retired == . & L4.retired == .) | (L2.retired == . & L5.retired == .) | (L3.retired == . & L4.retired == .) | (L3.retired == . & L5.retired == .) ///
| (L4.retired == . & L5.retired == .)) & year < 1998
*if the first year doesn't exist because before sample, then don't count in this. 
*mark first obs
by unique: g obs1 = 1 if _n == 1
replace yr_gap5 = 0 if L5.retired == . & L4.obs1 == 1
	*retired is missing because before start of ts and L4 is the first observation

*Mark if retired for past 5 years continuously 
*Annual data: if at least 4 out of the 5 past years are non-missing and retired in all those 4, then this counts as a 5 year period 	
by unique: g retired_5 = 1 if L.retired == 1 & L2.retired == 1 & L3.retired == 1 & L4.retired == 1  & L5.retired == 1 & year < 1998 
by unique: replace retired_5 = 1 if (retired == 1 | retired == .) & (L.retired == 1 | L.retired == .) & (L2.retired == 1 | L2.retired == .) & (L3.retired == 1 | L3.retired == .) ///
& (L4.retired == 1 | L4.retired == .) & (L5.retired == 1 | L5.retired == .) & year < 1998 & yr_gap5 == 1 
	*retired will only be missing if the year is missing 
*every other year: 
by unique: replace retired_5 = 1 if L2.retired == 1 & L4.retired == 1  & year > 2000
*at 1999, need one 2 year gap and then every year
by unique: replace retired_5 = 1 if L2.retired == 1 & L3.retired == 1 & L5.retired == 1 & year == 1999

*year at which this happens
by unique: g year_retired_5 = year if retired_5 == 1
by unique: ereplace year_retired_5 = min(year_retired_5)
	*first time this happens, take min
*mark all retired afterwards
by unique: replace retired = 1 if year >= year_retired_5
	*259 additional obs; affects 147 people
*what if retired from beginning? then not_retired != 1 so just counted as retired 

*4. if retired for only one year, then not retired 

*how long retired?	
tsspell retired, fcond((retired != retired[_n-1] ) | (_n == 1))
replace _seq = . if retired == 0
replace _spell = . if retired == 0 
*mark how long sequence is 
bys unique _spell: egen retired_length = count(_seq) 

xtset unique year 

*replace length to missing if beginning or end of individuals ts. One year gap in survey years is ok. 
by unique: g end_beg = 1 if _n == 1 | _n == _N | (yr_gap == 1 & yr_gap1 != 1) | (yr_gap[_n+1] == 1 & yr_gap1[_n+1] != 1)
*if post 1998 don't want to consider a jump if missing survey years on both sides, too much unkown 
by unique: replace end_beg = . if end_beg == 1 & L2.end_beg == 1 & F2.end_beg == 1 & year > 1998 
	*no instances of this, so don't worry about it. this code wouldn't fully fix the problem, but it shows that there is no problem
bys unique _spell: ereplace end_beg = max(end_beg) if _spell != .
replace retired_length = . if end_beg == 1

xtset unique year

*if retired for just one year (not at the beginning or end of time series) then mark as not retired
by unique: replace retired = 0 if retired_length == 1 
*lose 694 observations; 610 people

drop retired_5 year_retired_5 not_retired last_inc _*

///////////////////
**NILF (not in labor force)
*mark if not in the labor force. do not include in age/earnings profile estimation or deviation determination. 
g nilf = 0

*1. not in labor force if labor income is zero and not unemployed
replace nilf = 1 if labincrc == 0  & unemployed != 1 
*62,341 obs; 12,490 people
*31,724 obs additional to retired; 9,029 people affected

*2. nilf if income drops to less than 20% of previous income. unless unemployed then still in lf  
*If income then rises more than 20% it becomes the new standard that subsequent incomes are compared to 
	*By PERIOD not year
by unique: g last_inc = labincrc 
by unique: replace last_inc = last_inc[_n-1] if  _n != 1 &  last_inc < .2*last_inc[_n-1]
replace nilf = 1 if last_inc != labincrc & unemployed != 1
*additional obs: 5,732; 3,635 people affected

*3. nilf if maximum income is less than 5000 for all in lf observations 
by unique: egen max_inc_lf = max(labincrc) if nilf == 0 & retired != 1 & student != 1 & bus != 1
*ALL observations then become not in labor force 
	*even observations that are student, retired, business income
	*this is because don't want something like student- new nilf b.c low inc-student to be caught by the next rule just because didn't change the student obs to also nilf 
by unique: ereplace max_inc_lf = max(max_inc_lf)
replace nilf = 1 if max_inc_lf < 5000
	*even for unemployment observations - also includes people where the only LF obs are unemployment
*3530 obs additional; 1,532 people

*4. If nilf for just one period (not at beginning or end of individual's time series), then consider in LF
	*ok if 1 year missing on either side, still consider a 1 year jump.  
	*if post 1998, then can only have a missing survey year on one side. otherwise don't know what's happening for 3 years on either side of jump. 
*how long out of the labor force?	
tsspell nilf, fcond((nilf != nilf[_n-1] ) | (_n == 1))
replace _seq = . if nilf == 0
replace _spell = . if nilf == 0 
*mark how long sequence is 
bys unique _spell: egen nilf_length = count(_seq) 
xtset unique year

*replace length to missing if beginning or end of individuals' time series. One year gap in survey years is ok. 
drop end_beg 
by unique: g end_beg = 1 if _n == 1 | _n == _N | (yr_gap == 1 & yr_gap1 != 1) | (yr_gap[_n+1] == 1 & yr_gap1[_n+1] != 1)
*if post 1998 don't want to consider a jump if missing survey years on both sides, too much unkown 
by unique: replace end_beg = . if end_beg == 1 & L2.end_beg == 1 & F2.end_beg == 1 & year > 1998 
	*no instances of this, so don't worry about it. this code wouldn't fully fix the problem, but it shows that there is no problem
bys unique _spell: ereplace end_beg = max(end_beg) if _spell != .
replace nilf_length = . if end_beg == 1

drop _*
xtset unique year
	
*see if unemployed in previous period (ONLY LOOKING AT ANNUAL OBS: where can tell)
*not making any changes. just checking.
count if nilf_length == 1  & L.unemployed == 1 & retired != 1 & student != 1 & bus != 1
 count if nilf_length == 1  & L.unique == unique & retired != 1 & student != 1 & bus != 1
	*494 / 1805
	*about 27% -- discouraged?
replace nilf = 0 if nilf_length == 1 
*lost 4,593 observations; affects 3,753 people

*5. If in the LF for just one year and income less than $5,000 or less than 20% rule then out of LF
*how long in the labor force?	
g lf = 0 
replace lf = 1 if nilf == 0

tsspell lf, fcond((lf != lf[_n-1] ) | (_n == 1))
replace _seq = . if lf == 0
replace _spell = . if lf == 0 
*mark how long sequence is 
bys unique _spell: egen lf_length = count(_seq) 
xtset unique year

*replace length to missing if beginning or end of ts. One year gap in survey years is ok. 
drop end_beg 
by unique: g end_beg = 1 if _n == 1 | _n == _N | (yr_gap == 1 & yr_gap1 != 1) | (yr_gap[_n+1] == 1 & yr_gap1[_n+1] != 1)
*if post 1998 don't want to consider a jump if missing survey years on both sides, too much unkown 
by unique: replace end_beg = . if end_beg == 1 & L2.end_beg == 1 & F2.end_beg == 1 & year > 1998 
	*no instances of this, so don't worry about it. this code wouldn't fully fix the problem, but it shows that there is no problem
bys unique _spell: ereplace end_beg = max(end_beg) if _spell != .
replace lf_length = . if end_beg == 1

xtset unique year

*count if lf_length == 1 & bus != 1 & student != 1 & retired != 1 
*452 lf length 
	*168 are unemployed in that 1 observation 
*0 nilf length 	

*make change	
replace nilf = 1 if lf_length == 1 &  (labincrc != last_inc | labincrc < 5000)
*239 obs; 225 people affected
*out of the 452 number above, this dealt with 213 instances 
*239 instances left where income is greater than 5,000 and greater than 20% rule 
	
drop _* 
xtset unique year	

///////////////////	
*mark if have at least 5 observations of in the labor force and not unemployed - criteria for inclusion in individual profile estimation.
	*not necessary for the overall estimation (across all individuals and ages) - just have to be in lf for that 
bys unique: egen employed_obs = count(unique) if retired != 1 & nilf != 1 & bus != 1 & student != 1 & unemployed != 1 
*spread employed_obs to all observations
bys unique: ereplace employed_obs = max(employed_obs)
replace employed_obs = 0 if employed_obs == . 

///////////////////
*education categories
g less_hs = 0
replace less_hs = 1 if educ==1

g hs_degree = 0 
replace hs_degree = 1 if educ==2

g some_college = 0 
replace some_college = 1 if educ==3

g college_adv = 0 
replace college_adv = 1 if educ==4 | educ==5

la var less_hs "Less than High School"
la var hs_degree "High School Degree"
la var some_college "Some College"
la var college_adv "College and Advanced Degrees"

bysort unique: ereplace less_hs=max(less_hs)
bysort unique: ereplace hs_degree=max(hs_degree)
bysort unique: ereplace some_college=max(some_college)
bysort unique: ereplace college_adv=max(college_adv)

*only assign highest achieved education level. 
bys unique: replace less_hs = 0 if hs_degree == 1 

bys unique: replace hs_degree = 0 if some_college == 1
bys unique: replace less_hs = 0 if some_college == 1

bys unique: replace some_college = 0 if college_adv == 1 
bys unique: replace hs_degree = 0 if college_adv == 1 
bys unique: replace less_hs = 0 if college_adv == 1 

///////////////////
*generating sex and education groups
g hss = 1 if less_hs==1 
g hsd = 1 if hs_degree==1 
g sc = 1 if some_college==1 
g cad=1 if college_adv==1 

g hss_m = 1 if less_hs==1 & sex==1
g hsd_m = 1 if hs_degree==1 & sex==1
g sc_m = 1 if some_college==1 & sex==1
g cad_m =1 if college_adv==1 & sex==1
	
g hss_f = 1 if less_hs==1 & sex==2
g hsd_f = 1 if hs_degree==1 & sex==2
g sc_f = 1 if some_college==1 & sex==2
g cad_f =1 if college_adv==1 & sex==2

g age2 = age^2
g age3 = age^3

*regressing log income onto a constant and age, age squared and age cubed for each education/age group
g alpha_i=.
g beta1=.
g beta2=.
g beta3=.

***************************
* Age/Earnings Estimation *
***************************

*estimate overall curves with everyone that in the LF (including unemployed)
*estimate individual curves for people with at least 5 observations of valid LF observations (excluding unemployed)

*estimate separately by gender

************
*Male AE Curves
************
*overall curve
reg labincrc age age2 if hss_m==1  & retired != 1 & student != 1 & nilf != 1 & bus != 1 , r
*individual alpha 
replace alpha_i=labincrc - _b[age]*age - _b[age2]*age2  if hss_m==1 & employed_obs>=5 & employed_obs!=. & retired != 1 & student != 1  & nilf != 1 & bus != 1 
replace beta1 = _b[age] if hss_m==1 
replace beta2 = _b[age2] if hss_m==1
su year if e(sample)
	
reg labincrc age age2 if hsd_m==1  & retired != 1 & student != 1  & nilf != 1 & bus != 1 , r
replace alpha_i=labincrc - _b[age]*age - _b[age2]*age2 if hsd_m==1 & employed_obs>=5 & employed_obs!=.  & retired != 1 & student != 1  & nilf != 1 & bus != 1 
replace beta1 = _b[age] if hsd_m==1
replace beta2 = _b[age2] if hsd_m==1
su year if e(sample)
		
reg labincrc age age2 if sc_m==1  & retired != 1 & student != 1  & nilf != 1 & bus != 1 , r
replace alpha_i=labincrc - _b[age]*age - _b[age2]*age2 if sc_m==1 & employed_obs>=5 & employed_obs!=.  & retired != 1 & student != 1  & nilf != 1 & bus != 1 
replace beta1 = _b[age] if sc_m==1
replace beta2 = _b[age2] if sc_m==1
su year if e(sample)

reg labincrc age age2 if cad_m==1  & retired != 1 & student != 1  & nilf != 1 & bus != 1 , r
replace alpha_i=labincrc - _b[age]*age - _b[age2]*age2 if cad_m==1 & employed_obs>=5 & employed_obs != . & retired != 1 & student != 1  & nilf != 1 & bus != 1 
replace beta1 = _b[age] if cad_m==1
replace beta2 = _b[age2] if cad_m==1
	
**************
*Female Curves
**************
reg labincrc age age2 if hss_f==1  & retired != 1 & student != 1  & nilf != 1 & bus != 1 , r 
replace alpha_i=labincrc - _b[age]*age - _b[age2]*age2 if hss_f==1 & employed_obs>=5 & employed_obs!=.  & retired != 1 & student != 1  & nilf != 1 & bus != 1 
replace beta1 = _b[age] if hss_f==1 
replace beta2 = _b[age2] if hss_f==1
su year if e(sample)

reg labincrc age age2 if hsd_f==1  & retired != 1 & student != 1  & nilf != 1 & bus != 1 , r
replace alpha_i=labincrc - _b[age]*age - _b[age2]*age2 if hsd_f==1 & employed_obs>=5 & employed_obs!=.  & retired != 1 & student != 1  & nilf != 1 & bus != 1 
replace beta1 = _b[age] if hsd_f==1
replace beta2 = _b[age2] if hsd_f==1
su year if e(sample)
		
reg labincrc age age2 if sc_f==1  & retired != 1 & student != 1  & nilf != 1 & bus != 1 , r
replace alpha_i=labincrc - _b[age]*age - _b[age2]*age2 if sc_f==1 & employed_obs>=5 & employed_obs!=.  & retired != 1 & student != 1  & nilf != 1 & bus != 1 
replace beta1 = _b[age] if sc_f==1
replace beta2 = _b[age2] if sc_f==1
su year if e(sample)

reg labincrc age age2 if cad_f==1  & retired != 1 & student != 1  & nilf != 1 & bus != 1 , r
replace alpha_i=labincrc - _b[age]*age - _b[age2]*age2 if cad_f==1 & employed_obs>=5 & employed_obs != .  & retired != 1 & student != 1  & nilf != 1 & bus != 1 
replace beta1 = _b[age] if cad_f==1
replace beta2 = _b[age2] if cad_f==1

*different intercepts for each time-person pair. take median 
bysort unique: egen alpha_i_med = median(alpha_i) if employed_obs>=5 & employed_obs!=.
bysort unique: ereplace alpha_i_med = max(alpha_i_med)

*don't want to estimate individual curves for observations where retired, student etc.
replace alpha_i_med = . if retired == 1 | student == 1 | nilf == 1 | bus == 1 

*calculating fitted earnings
g y_hat_med = alpha_i_med + beta1*age  + beta2*age2 

*calculate deviation based on individual labor income.
*in main data: constrained if one earner is constrained. also mark if both earners are constrained. 
g per_dev_med = 100*(labincrc - y_hat_med)/y_hat_med

*"bottom coding" the observations with a negative expected income, so they're not 
*considered negative income shocks from our formulas. 
replace per_dev_med = 1 if y_hat_med<0 & labincrc != .

keep per_dev_med unique year head wife famid retired student nilf bus labincrc y_hat y_hat_med age

////////////////////////////////////////////////////////////////////////////////
*RE-FORM HHs
foreach var in per_dev_med retired student nilf bus labincrc y_hat_med age {
	g w`var' = `var' if wife == 1
	g h`var' = `var' if head == 1 
}

*reassign everything to head
sort year famid wife
foreach var in per_dev_med retired student nilf bus labincrc y_hat_med age {
	by year famid: g w`var'2 = w`var'[_n-1] if _n != 1 
}

*only keep head: households
keep if head == 1

foreach var in per_dev_med retired student nilf bus labincrc y_hat_med age {
	replace w`var' = w`var'2
	drop w`var'2
	drop `var' 
}

/////////
*CALCULATE HH DEVIATION
*Deviation of HH income (head + wife)
*joint income
egen hhlabincrc = rowtotal(hlabincrc wlabincrc)

*joint fitted earnings
egen hhy_hat = rowtotal(wy_hat hy_hat)
egen hhy_hat_med = rowtotal(wy_hat_med hy_hat_med)

*joint deviation 
g hhper_dev = 100*(hhlabincrc - hhy_hat)/hhy_hat
g hhper_dev_med = 100*(hhlabincrc - hhy_hat_med)/hhy_hat_med

*"bottom coding" the observations with a negative expected income, so they're not 
*considered negative income shocks
replace hhper_dev_med = 1 if hhy_hat_med<0 & hhlabincrc != .

*a measure of the volatility of deviations from hh age earnings profiles
bysort unique year: gen hhper_sqdev = (hhper_dev_med/100)^2
bysort unique: egen avgsqdev = mean(hhper_sqdev)
replace avgsqdev = . if avgsqdev == 0
bysort unique: gen avgsqdev100 = avgsqdev/100

* predicted log income
gen lnyhat = ln(hhy_hat_med)

save `data'/age_earning_profiles_posttax_15_hmr.dta, replace
}
